import pandas as pd
import docx
import docx
import re


# Load the source Word document (hard-coded sample path).
file = docx.Document(r"C:\Users\kf40\Desktop\智能提取信息\sample01.docx")
# Each carriage return in the document starts a new paragraph.
print(f"段落数: {len(file.paragraphs)}")


tables = file.tables
# Row labels (first cell of a table row) whose value (second cell) is collected.
keys =['基金名称',"基金简称","场内简称","基金主代码",'基金类别','基金管理人','基金托管人','基金合同生效日','基金运作方式','业绩比较基准','投资目标','投资范围','投资比例','基金分红','风险收益特征','风险等级']
result ={}
for table in tables:
    for row in table.rows:
        # Cell texts of this row with ASCII spaces stripped out.
        row_data = [cell.text.replace(" ", "") for cell in row.cells]
        # A usable row needs a label cell and a value cell; shorter rows
        # are skipped instead of raising IndexError.
        if len(row_data) >= 2 and row_data[0] in keys:
            result[row_data[0]] = row_data[1]

# Override the fund category based on keywords in the fund name.  The checks
# are deliberately independent (not elif): when several keywords occur, the
# last matching rule wins.  A missing fund name leaves the category untouched.
fund_name = result.get("基金名称", "")

if "竞争力指数发起式证券投资基金" in fund_name:
    result["基金类别"] = "股票型投资基金"
if "混合型" in fund_name:
    if "FOF" in fund_name:
        result["基金类别"] = "混合型基金中基金"
    else:
        result["基金类别"] = "混合型证券投资基金"
if "债券型" in fund_name:
    result["基金类别"] = "债券型证券投资基金"
if "货币市场" in fund_name:
    result["基金类别"] = "货币市场基金"


# Extract the profit-distribution ("基金分红") text from the document that is
# already loaded in `file`; loading the same path again is unnecessary.
print(f"段落数: {len(file.paragraphs)}")

# Fetch the paragraph list once instead of re-reading it on every iteration.
paragraphs = file.paragraphs

# Locate the section boundaries.  The loop does not stop at the first hit, so
# the LAST paragraph containing each heading is used.
start = None
end = None
for idx, para in enumerate(paragraphs):
    para_text = para.text
    if '管理人对报告期内基金利润分配情况的说明' in para_text:
        start = idx
    if '报告期内管理人对本基金持有人数或基金资产净值预警情形的说明' in para_text:
        end = idx

if start is None or end is None:
    # Fail with a clear message rather than a NameError further down.
    raise ValueError("profit-distribution section headings not found in document")

# Join the paragraphs after the start heading; the two paragraphs directly
# before the end heading are excluded as well.
# NOTE(review): presumably those two are heading/footer lines - confirm.
text = "".join(paragraphs[i].text for i in range(start + 1, end - 2))

# Strip page footer/header residue ("第 N 页 ... 年度报告").  The quantifiers are
# non-greedy so each occurrence is removed on its own; a greedy match would
# also delete all real content between the first footer and the last header.
text = re.sub("第.*?页.*?年度报告", "", text)
result["基金分红"] = text

# Extract the investment scope ("投资范围") and investment ratio ("投资比例")
# text from the paragraphs between two marker sentences.
paragraphs = file.paragraphs

# Reset the boundaries so indices left over from an earlier section can never
# be reused silently when a marker is missing here.  The LAST paragraph
# containing each marker wins; every matching paragraph is echoed.
start = None
end = None
for idx, para in enumerate(paragraphs):
    para_text = para.text
    if '根据《中华人民共和国证券投资基金法》和' in para_text:
        start = idx
        print(para_text)
    # The end marker embeds one specific fund manager's name.
    if '本财务报表由基金管理人上海东方证券资产管理有限公司于' in para_text:
        end = idx
        print(para_text)

if start is None or end is None:
    raise ValueError("investment-scope section markers not found in document")

# Both marker paragraphs are included in the joined text.
text = "".join(paragraphs[i].text for i in range(start, end + 1))

# Strip page footer/header residue; non-greedy so that content lying between
# two page breaks is preserved.
text = re.sub("第.*?页.*?年度报告", "", text)

# Store the matched text itself (empty string when nothing matches) instead of
# the repr of the list returned by re.findall.
text1 = "".join(re.findall("本基金的投资范围.+可以将其纳入投资范围。", text))
text2 = "".join(re.findall("可以将其纳入投资范围。(.*?)本财务报表由基金管理人", text))
result["投资范围"] = text1
result["投资比例"] = text2

# Show full cell contents when the frame is displayed.  None is the supported
# "no limit" value; -1 is the deprecated spelling.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # NOTE(review): presumably only very old pandas releases land here,
    # where the option accepts integers only - confirm the minimum version.
    pd.set_option('display.max_colwidth', -1)

# One-column frame: index = field name, 'content' = extracted value.
df = pd.Series(result).to_frame('content')
df = df.rename(index={'基金主代码':"基金代码"})

# Fixed row order of the report; fields that were not extracted become NaN
# rows after reindexing and are dropped below.
column_order = [
    '基金名称', "基金简称", "场内简称", "基金代码", '基金类别', '基金管理人',
    '基金托管人', '基金合同生效日', '基金运作方式', '业绩比较基准', '投资目标',
    '投资范围', '投资比例', '基金分红', '风险收益特征', '风险等级',
]
df = df.reindex(column_order)
df = df.dropna()

# The workbook is named after the fund's short name.
name = result['基金简称']
path = 'C:\\Users\\kf40\\Desktop\\智能提取信息\\'+ name +'.xlsx'
df.to_excel(path)